Lab 3.6

3.6.1 Libraries

library(MASS)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
## 
##     Boston

3.6.2 Simple Linear Regression

Using the lm() function to fit a simple linear regression model, with medv as the response and lstat as the predictor.

head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7
attach(Boston)
lm.fit <- lm(medv ~ lstat)
summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ lstat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16

Accessing information within lm.fit

names(lm.fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
confint(lm.fit)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505

The predict() function can be used to produce confidence intervals and prediction intervals for the prediction of medv for a given value of lstat

predict(lm.fit, data.frame(lstat = (c(5, 10, 15))), interval = "confidence")
##        fit      lwr      upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))), interval = "prediction")
##        fit       lwr      upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310  8.077742 32.52846

Plotting medv and lstat along with the least squares regression line using the plot() and abline() functions

plot(lstat, medv, pch = "+")
abline(lm.fit, lwd = 3, col = "red")

plot (1:20, 1:20, pch = 1:20)

par(mfrow = c(2 , 2))
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
plot (hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
## 375 
## 375

3.6.3 Multiple Linear Regression

The syntax lm(y ∼ x1 + x2 + x3) is used to fit a model with three predictors, x1, x2, and x3

lm.fit = lm(medv ~ lstat + age, data = Boston)
summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16

Using all 12 variables as predictors

lm.fit = lm(medv ~ ., data = Boston)
summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1304  -2.7673  -0.5814   1.9414  26.2526 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.617270   4.936039   8.431 3.79e-16 ***
## crim         -0.121389   0.033000  -3.678 0.000261 ***
## zn            0.046963   0.013879   3.384 0.000772 ***
## indus         0.013468   0.062145   0.217 0.828520    
## chas          2.839993   0.870007   3.264 0.001173 ** 
## nox         -18.758022   3.851355  -4.870 1.50e-06 ***
## rm            3.658119   0.420246   8.705  < 2e-16 ***
## age           0.003611   0.013329   0.271 0.786595    
## dis          -1.490754   0.201623  -7.394 6.17e-13 ***
## rad           0.289405   0.066908   4.325 1.84e-05 ***
## tax          -0.012682   0.003801  -3.337 0.000912 ***
## ptratio      -0.937533   0.132206  -7.091 4.63e-12 ***
## lstat        -0.552019   0.050659 -10.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.798 on 493 degrees of freedom
## Multiple R-squared:  0.7343, Adjusted R-squared:  0.7278 
## F-statistic: 113.5 on 12 and 493 DF,  p-value: < 2.2e-16

We want to remove age as a predictor due to its high p-value.

lm.fit1 <- update(lm.fit, ~ . - age)

3.6.4 Interaction Terms

lstat * age includes lstat, age, and the interaction term lstat × age as predictors

summary(lm(medv ~ lstat * age, data = Boston))
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

3.6.5 Non-linear Transformations of the Predictors

Performing a regression of medv onto lstat and lstat^2

lm.fit2 <- lm(medv ~ lstat + I(lstat^2))
summary(lm.fit2)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16

Using anova() to see how much better our new quadratic fit is compared to the linear fit

lm.fit <- lm(medv ~ lstat)
anova(lm.fit, lm.fit2)
## Analysis of Variance Table
## 
## Model 1: medv ~ lstat
## Model 2: medv ~ lstat + I(lstat^2)
##   Res.Df   RSS Df Sum of Sq     F    Pr(>F)    
## 1    504 19472                                 
## 2    503 15347  1    4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2 , 2))
plot(lm.fit2)

Trying a fifth order polynomial transformation

lm.fit5 <- lm(medv ~ poly(lstat, 5))
summary(lm.fit5)
## 
## Call:
## lm(formula = medv ~ poly(lstat, 5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5433  -3.1039  -0.7052   2.0844  27.1153 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.5328     0.2318  97.197  < 2e-16 ***
## poly(lstat, 5)1 -152.4595     5.2148 -29.236  < 2e-16 ***
## poly(lstat, 5)2   64.2272     5.2148  12.316  < 2e-16 ***
## poly(lstat, 5)3  -27.0511     5.2148  -5.187 3.10e-07 ***
## poly(lstat, 5)4   25.4517     5.2148   4.881 1.42e-06 ***
## poly(lstat, 5)5  -19.2524     5.2148  -3.692 0.000247 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared:  0.6817, Adjusted R-squared:  0.6785 
## F-statistic: 214.2 on 5 and 500 DF,  p-value: < 2.2e-16

Trying a log transformation

summary(lm(medv ~ log(rm), data = Boston))
## 
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.487  -2.875  -0.104   2.837  39.816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -76.488      5.028  -15.21   <2e-16 ***
## log(rm)       54.055      2.739   19.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared:  0.4358, Adjusted R-squared:  0.4347 
## F-statistic: 389.3 on 1 and 504 DF,  p-value: < 2.2e-16

3.3.6 Qualitative Predictors

Given a qualitative variable such as Shelveloc, R generates dummy variables automatically. contrasts()

lm.fit <- lm(Sales ~ . + Income : Advertising + Price : Age, data = Carseats)
summary(lm.fit)
## 
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9208 -0.7503  0.0177  0.6754  3.3413 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
## CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
## Income              0.0108940  0.0026044   4.183 3.57e-05 ***
## Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
## Population          0.0001592  0.0003679   0.433 0.665330    
## Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
## ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
## ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
## Age                -0.0579466  0.0159506  -3.633 0.000318 ***
## Education          -0.0208525  0.0196131  -1.063 0.288361    
## UrbanYes            0.1401597  0.1124019   1.247 0.213171    
## USYes              -0.1575571  0.1489234  -1.058 0.290729    
## Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
## Price:Age           0.0001068  0.0001333   0.801 0.423812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared:  0.8761, Adjusted R-squared:  0.8719 
## F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
attach(Carseats)
contrasts(ShelveLoc)
##        Good Medium
## Bad       0      0
## Good      1      0
## Medium    0      1

3.6.7 Writing Functions

Writing and calling functions in R.

LoadLibraries <- function() {
library(ISLR2)
library(MASS)
print("The libraries have been loaded.")
}
LoadLibraries()
## [1] "The libraries have been loaded."

Exercise 3.7

8) a)
Answer -

attach(Auto)
lm.fit = lm(mpg ~ horsepower)
summary(lm.fit)
## 
## Call:
## lm(formula = mpg ~ horsepower)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

8) a) i)
Answer - As value p < 2.2e-16, there is a relationship between the predictor and the response which is statistically significant.

lm.fit<-lm(mpg~horsepower,data=Auto)
summary(lm.fit)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

8) a) ii)
Answer - 60.59% of the variance in mpg is explained by horsepower.

summary(lm.fit)$r.squared
## [1] 0.6059483
summary(lm.fit)$sigma
## [1] 4.905757
summary(lm.fit)$sigma/mean(Auto$mpg)
## [1] 0.2092371

8) a) iii)
Answer - Negative

coefficients(lm.fit)["horsepower"]
## horsepower 
## -0.1578447

8) a) iv)
Answer - Prediction interval = 24.467 ± 9.658

predict(lm.fit, data.frame(horsepower = 98), interval = "confidence", level = 0.95)
##        fit      lwr      upr
## 1 24.46708 23.97308 24.96108

8) b)
Answer -

plot(horsepower, mpg)
abline(lm.fit, lwd=3, col="red")

8) c)
Answer -

par(mfrow = c(2 , 2))
plot(lm.fit)

9) a)
Answer -

pairs(Auto, pch = 19)

9) b)
Answer -

Auto$name = NULL
cor(Auto, method="pearson")
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

9) c) i)
Answer - Yes, there is a relation between the predictors and response. According to the null hypothesis all the coefficients are 0. F-statistic is used to test the hypothesis. As p-value < 2.2e-16, null hypothesis is rejected.

lm.fit<-lm(mpg~. , data=Auto)
summary(lm.fit)
## 
## Call:
## lm(formula = mpg ~ ., data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16

9) c) ii)
Answer - Displacement, weight, year, origin

9) c) iii)
Answer - mpg value increases when every other predictor is kept constant. If there is an increase of a unit in year then there will be increase of 0.7507727 unit in mpg.

coef(lm.fit)["year"]
##      year 
## 0.7507727

9) d)
Answer - The first graph shows that there is a non-linear relationship between the response and the predictors. The second graph shows that the residuals are normally distributed and right skewed. The third graph shows that the constant variance of error assumption is False. The fourth graphs shows that there are no leverage points. However, there’s one potential leverage point.

par(mfrow = c(2,2))
plot(lm.fit,col = "red")

9) e)
Answer - Yes. few interactions are statistically significant as shown by the Signif. codes/p-values in the summary below For example, acceleration and origin

lm.fit = lm(mpg ~ .*. + displacement:weight,data=Auto)
summary(lm.fit)
## 
## Call:
## lm(formula = mpg ~ . * . + displacement:weight, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6303 -1.4481  0.0596  1.2739 11.1386 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                3.548e+01  5.314e+01   0.668  0.50475   
## cylinders                  6.989e+00  8.248e+00   0.847  0.39738   
## displacement              -4.785e-01  1.894e-01  -2.527  0.01192 * 
## horsepower                 5.034e-01  3.470e-01   1.451  0.14769   
## weight                     4.133e-03  1.759e-02   0.235  0.81442   
## acceleration              -5.859e+00  2.174e+00  -2.696  0.00735 **
## year                       6.974e-01  6.097e-01   1.144  0.25340   
## origin                    -2.090e+01  7.097e+00  -2.944  0.00345 **
## cylinders:displacement    -3.383e-03  6.455e-03  -0.524  0.60051   
## cylinders:horsepower       1.161e-02  2.420e-02   0.480  0.63157   
## cylinders:weight           3.575e-04  8.955e-04   0.399  0.69000   
## cylinders:acceleration     2.779e-01  1.664e-01   1.670  0.09584 . 
## cylinders:year            -1.741e-01  9.714e-02  -1.793  0.07389 . 
## cylinders:origin           4.022e-01  4.926e-01   0.816  0.41482   
## displacement:horsepower   -8.491e-05  2.885e-04  -0.294  0.76867   
## displacement:weight        2.472e-05  1.470e-05   1.682  0.09342 . 
## displacement:acceleration -3.479e-03  3.342e-03  -1.041  0.29853   
## displacement:year          5.934e-03  2.391e-03   2.482  0.01352 * 
## displacement:origin        2.398e-02  1.947e-02   1.232  0.21875   
## horsepower:weight         -1.968e-05  2.924e-05  -0.673  0.50124   
## horsepower:acceleration   -7.213e-03  3.719e-03  -1.939  0.05325 . 
## horsepower:year           -5.838e-03  3.938e-03  -1.482  0.13916   
## horsepower:origin          2.233e-03  2.930e-02   0.076  0.93931   
## weight:acceleration        2.346e-04  2.289e-04   1.025  0.30596   
## weight:year               -2.245e-04  2.127e-04  -1.056  0.29182   
## weight:origin             -5.789e-04  1.591e-03  -0.364  0.71623   
## acceleration:year          5.562e-02  2.558e-02   2.174  0.03033 * 
## acceleration:origin        4.583e-01  1.567e-01   2.926  0.00365 **
## year:origin                1.393e-01  7.399e-02   1.882  0.06062 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared:  0.8893, Adjusted R-squared:  0.8808 
## F-statistic: 104.2 on 28 and 363 DF,  p-value: < 2.2e-16

9) f)
Answer - Among these plots, log(horsepower) seems to be most linear looking.

par(mfrow = c(2, 2))
plot(log(Auto$horsepower), Auto$mpg)
plot(sqrt(Auto$horsepower), Auto$mpg)
plot((Auto$horsepower)^2, Auto$mpg)

13) a)
Answer -

set.seed(1)
X = rnorm(100, mean = 0, sd = 1)

13) b)
Answer -

eps = rnorm(100, mean = 0, sd = sqrt(0.25))

13) c)
Answer - Lenght is 100. Beta 0 is -1 and Beta 1 is 0.5.

Y = -1 + 0.5 * X + eps
length(Y)
## [1] 100

13) d)
Answer - X and Y have a positive correlation and they have a linear relationship

plot(X, Y,col="red")

13) e)
Answer - The estimated values are very close, the presence of irreducible error prevents a perfect fit.

model1 = lm(Y ~ X)
coef(model1)
## (Intercept)           X 
##  -1.0188463   0.4994698
summary(model1)$adj.r.squared
## [1] 0.4619164

13) f)
Answer -

plot(X, Y, typ = "p")
abline(model1, col = "red", lty = "dotted")
abline(a = -1, b = 0.5, col = "blue", lty = "solid")
legend("bottomright", legend = c("actual", "estimate"), col = c("blue", "red"), lty = c("solid", "dotted"))

13) g)
Answer - No, 2 coefficient is not statistically significant. i.e.Pr(>|t|) is 1.638275e-01 (which is greater than 0.05, a common rejection threshold). As the Adjusted R squared estimate is slightly larger which implies this is a better model.

model2 = lm(Y ~ poly(X, 2))
summary(model2)$coefficients
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.9644604 0.04790161 -20.134193 2.046895e-36
## poly(X, 2)1  4.4637471 0.47901614   9.318573 3.972802e-15
## poly(X, 2)2 -0.6720309 0.47901614  -1.402940 1.638275e-01
summary(model2)$adj.r.squared
## [1] 0.4671806

13) h)
Answer - The model is more accurate as noise has decreased.

eps3 = rnorm(100, mean = 0, sd = 0.001)
y3 = -1 + 0.5 * X + eps3
model3 = lm(y3 ~ X)
coef(model3)
## (Intercept)           X 
##  -0.9999726   0.5000212
summary(model3)$adj.r.squared
## [1] 0.9999946
plot(X, y3, typ = "p")
abline(model3, col = "red", lty = "dotted")
abline(a = -1, b = 0.5, col = "blue", lty = "solid")
legend("bottomright", legend = c("actual", "estimate"), col = c("blue", "red"), lty = c("solid", "dotted"))

13) i)
Answer - The model is less accurate as noise has increased.

eps4 = rnorm(100, mean = 0, sd = 50)
y4 = -1 + 0.5 * X + eps4
model4 = lm(y4 ~ X)
coef(model4)
## (Intercept)           X 
##    1.883268   -2.284303
summary(model4)$adj.r.squared
## [1] -0.008472893
plot(X, y4, typ = "p")
abline(model4, col = "red", lty = "dotted")
abline(a = -1, b = 0.5, col = "blue", lty = "solid")
legend("bottomright", legend = c("actual", "estimate"), col = c("blue", "red"), lty = c("solid", "dotted"))

13) i)
Answer - Noisier data produces larger confidence intervals for the coefficients.

confint(model1)
##                  2.5 %     97.5 %
## (Intercept) -1.1150804 -0.9226122
## X            0.3925794  0.6063602
confint(model3)
##                  2.5 %     97.5 %
## (Intercept) -1.0001804 -0.9997648
## X            0.4997904  0.5002520
confint(model4)
##                  2.5 %   97.5 %
## (Intercept)  -8.066997 11.83353
## X           -13.336396  8.76779

14) a)
Answer = y = 2 + 2x1 + 0.3x2 where coefficients are 2, 2, 0.3

set.seed(1)
x1 = runif(100)
x2 = 0.5*x1 + rnorm(100)/10
y = 2 + 2*x1 + 0.3*x2 + rnorm(100)

14) b)
Answer =

cor(x1, x2)
## [1] 0.8351212
plot(x1, x2)

14) c)
Answer = The coefficients are 2.131, 1.44, 1.01. Checking with the p-values of our parameters, we cannot reject the hypothesis for beta 1 or beta 2 with 95% confidence. The results indicate nontrivial bias between all of the parameters.

model = lm(y ~ x1 + x2)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8311 -0.7273 -0.0537  0.6338  2.3359 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.1305     0.2319   9.188 7.61e-15 ***
## x1            1.4396     0.7212   1.996   0.0487 *  
## x2            1.0097     1.1337   0.891   0.3754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.056 on 97 degrees of freedom
## Multiple R-squared:  0.2088, Adjusted R-squared:  0.1925 
## F-statistic:  12.8 on 2 and 97 DF,  p-value: 1.164e-05

14) d)
Answer = We can confidently reject the null hypothesis now.

model = lm(y ~ x1)
summary(model)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.89495 -0.66874 -0.07785  0.59221  2.45560 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.1124     0.2307   9.155 8.27e-15 ***
## x1            1.9759     0.3963   4.986 2.66e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.055 on 98 degrees of freedom
## Multiple R-squared:  0.2024, Adjusted R-squared:  0.1942 
## F-statistic: 24.86 on 1 and 98 DF,  p-value: 2.661e-06

14) e)
Answer = We can confidently reject the null hypothesis now.

model = lm(y ~ x2)
summary(model)
## 
## Call:
## lm(formula = y ~ x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.62687 -0.75156 -0.03598  0.72383  2.44890 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.3899     0.1949   12.26  < 2e-16 ***
## x2            2.8996     0.6330    4.58 1.37e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.072 on 98 degrees of freedom
## Multiple R-squared:  0.1763, Adjusted R-squared:  0.1679 
## F-statistic: 20.98 on 1 and 98 DF,  p-value: 1.366e-05

14) f)
Answer - The results do not contradict each other as the predictors are highly correlated and are collinear. Due to collinearity its difficult to determine how each predictor is associated with the response.

14) g)
Answer - In the model with two predictors, the last point is a high-leverage point. In the model with x1 as sole predictor, the last point is an outlier. In the model with x2 as sole predictor, the last point is a high leverage point.

x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
fit1 <- lm(y ~ x1 + x2)
fit2 <- lm(y ~ x1)
fit3 <- lm(y ~ x2)
summary(fit1)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73348 -0.69318 -0.05263  0.66385  2.30619 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.2267     0.2314   9.624 7.91e-16 ***
## x1            0.5394     0.5922   0.911  0.36458    
## x2            2.5146     0.8977   2.801  0.00614 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.075 on 98 degrees of freedom
## Multiple R-squared:  0.2188, Adjusted R-squared:  0.2029 
## F-statistic: 13.72 on 2 and 98 DF,  p-value: 5.564e-06
summary(fit2)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8897 -0.6556 -0.0909  0.5682  3.5665 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.2569     0.2390   9.445 1.78e-15 ***
## x1            1.7657     0.4124   4.282 4.29e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.111 on 99 degrees of freedom
## Multiple R-squared:  0.1562, Adjusted R-squared:  0.1477 
## F-statistic: 18.33 on 1 and 99 DF,  p-value: 4.295e-05
summary(fit3)
## 
## Call:
## lm(formula = y ~ x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.64729 -0.71021 -0.06899  0.72699  2.38074 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.3451     0.1912  12.264  < 2e-16 ***
## x2            3.1190     0.6040   5.164 1.25e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.074 on 99 degrees of freedom
## Multiple R-squared:  0.2122, Adjusted R-squared:  0.2042 
## F-statistic: 26.66 on 1 and 99 DF,  p-value: 1.253e-06
plot(fit1)

plot(fit2)

plot(fit3)